#!/usr/bin/perl
use POSIX;
use strict;
use List::Util qw(max);
no warnings 'experimental'; 

use Cwd 'abs_path';
use Getopt::Long;


my $genomeDatabaseFileName = "genome_database.csv";

#### Parameters
my $MAX_ORTH=16;
my $MIN_EVAL= 10;
my $MAX_EVAL = 150;
my $EVAL_CUTOFF_FOR_FAMILY=0.66;
my $MAX_HIGH_SENSITIVITY_PROTEIN_LENGTH=120;
##################

my $conservatoryDir=abs_path(".");
my $help=0;
my $genome="";
my $referenceGenome="";
my $referenceGenomeFamily="";
my $referenceGenomeSpecies="";
my $ref2gZipped=0;
my $g2refZipped =0;
my $family="";
my $species;
my $genomefound = 0;
my $verbose =0;
my %proteinsLengths;
my $geneProcessingREGEX="";
my $referenceGeneProcessingREGEX="";
my %genesWithFootprint;
my $blastLineRead=0;
my $blastHitForLocusCount;
my $minIdentity=0;

my $curReadLocus="";

my %genome2RefHits;
my %ref2GenomeHits;

my $outputOrthologFileName="";

$|++;

my $line;
GetOptions ("conservatoryDirectory=s" => \$conservatoryDir,
			"genome-database=s" => \$genomeDatabaseFileName,
			"genome=s" => \$genome,
			"reference=s" => \$referenceGenome,
			"max-eval=i" => \$MAX_EVAL,
			"min-eval=s" => \$MIN_EVAL,
			"min-identity=i" => \$minIdentity,			
			"output-ortholog-file=s" => \$outputOrthologFileName,
			"verbose" => \$verbose,
			"help" => \$help) or die("Error in command line arguments\n");
			
			
if( $genome eq "" || $help) {
	print "Conservatory version 0.0.1\n\n";
	
	print "buildOrthologDB\n\n";
	print "\t--conservatoryDirectory\tPath of the main conservatory directory. See README for directory structure. (DEFAULT: current directory)\n";
	print "\t--genome-database\t\tGenome database file name (DEFAULT: genome_database.csv).\n";	
	print "\t--reference\t\tReference genome name (for global mode). \n";
	print "\t--genome\t\tGenome name for which to build putative ortholog databse (REQUIRED). \n";
	print "\t--verbose\t\tPrint extra infromation.\n\n";
    	exit();
}
my $genomedb_file = "$conservatoryDir/$genomeDatabaseFileName";

##### Sanity checks.
die "ERROR: Cannot find file genome database file ($genomedb_file)\n" unless -e $genomedb_file;
die "ERROR: Conservatory directory structure at ($conservatoryDir) is corrupt\n" unless (-e "$conservatoryDir/genomes/tmpblast");

######### Find out what is the reference genome and the processing REGEX
open(my $genomedb, "<", $genomedb_file);

while(<$genomedb>) {
	chomp;
	my @genomeDBLine = split /,/;
	if($genomeDBLine[0] eq $genome) {
		$family = $genomeDBLine[2];
		$species = $genomeDBLine[1];
		$geneProcessingREGEX = $genomeDBLine[9];
	}
	if($genomeDBLine[0] eq $referenceGenome) {
		$referenceGenomeFamily = $genomeDBLine[2];
		$referenceGenomeSpecies = $genomeDBLine[1];
		$referenceGeneProcessingREGEX = $genomeDBLine[9];
	}
}

close($genomedb);

die "ERROR: Cannot file genome $genome in database.\n" unless $family ne "";
die "ERROR: Cannot file reference genome $genome in database.\n" unless $referenceGenomeFamily ne "";

my $genome2refFileName = "$conservatoryDir/genomes/tmpblast/$genome" . "2" . "$referenceGenome" . ".txt";
my $ref2genomeFileName = "$conservatoryDir/genomes/tmpblast/$referenceGenome" . "2" . "$genome" . ".txt";

### If the files are compressed
if (!( -e $genome2refFileName)) {
	$g2refZipped=1;
	$genome2refFileName .= ".gz";
}
if (!( -e $ref2genomeFileName)) {
	$ref2gZipped =1;
	$ref2genomeFileName .= ".gz";
}

if($outputOrthologFileName eq "") {
	$outputOrthologFileName = "$conservatoryDir/genomes/$family/$referenceGenome.$genome.orthologs.csv";
}
my $footprintGFFFileName = "$conservatoryDir/genomes/$family/$genome.footprint.gff3";
my $proteinSequenceFileName = "$conservatoryDir/genomes/$referenceGenomeFamily/$referenceGenome.proteins.fasta";

open (my $gffFile, "<", $footprintGFFFileName ) or die "ERROR: Cannot open footprint file $footprintGFFFileName.\n";
open (my $proteinFile, "<", $proteinSequenceFileName ) or die "ERROR: Cannot open protein sequence file $proteinSequenceFileName.\n";

my $g2ref;
my $ref2g;

if($g2refZipped) {
	open ($g2ref, "zcat $genome2refFileName |" ) or die "ERROR: Cannot open blast results file $genome2refFileName.\n";
} else {
	open ($g2ref, "<", $genome2refFileName ) or die "ERROR: Cannot open blast results file $genome2refFileName.\n";
}

if($ref2gZipped) {
	open ($ref2g,"zcat $ref2genomeFileName |" ) or die "ERROR: Cannot open blast results file $ref2genomeFileName.\n";
} else {
	open ($ref2g, "<", $ref2genomeFileName) or die "ERROR: Cannot open blast results file $ref2genomeFileName.\n";
}

open (my $output, ">", "$outputOrthologFileName.tmp");

### Set up the filter of acceptable genes (those that have promoter sequences) 
while(my $line = <$gffFile>) {
	chomp($line);
	my @array = split /\t/, $line;
	my %genename = split /[;=]/, $array[8];
	$genesWithFootprint{ $genename{'Name'} } = 1;
}
close($gffFile);

### Set up a list of high-sensitivity genes (short genes, usually peptides, which show have a different cut off for orthology)

my $proteinSeq;
my $geneName;
while(my $line = <$proteinFile>) {
	chomp($line);
	if(substr($line,0,1) eq ">") {
		if($geneName ne "") {
			$proteinsLengths{"$geneName"}= length($proteinSeq);
		}
		$proteinSeq = "";
		$geneName = substr($line,1);
		$geneName =~ s/ .*//;
		if($referenceGeneProcessingREGEX ne "") { eval '$geneName =~ s/$referenceGeneProcessingREGEX//g;'; }
		$geneName = "$referenceGenomeSpecies-$referenceGenome-$geneName";
	} else {
		$proteinSeq .= $line;
	}	
}
$proteinsLengths{"$geneName"}= length($proteinSeq);

close($proteinSequenceFileName);

#load the whole genome to reference blast table into a hash

print localtime() . ": PROGRESS : Start reading genome to reference blast.";

###### Read genome 2 reference hits
while(my $curline = <$g2ref>) {

	chomp($curline);
	my @line_s = split(/\t/, $curline);
	my $locus = $line_s[0];
	my $refLocus = $line_s[1];
	my $identity = $line_s[2];
	my $eval = $line_s[10];
	my $logeval = logEval($eval);

	### Translate protein name to  full and unique gene name
	if($referenceGeneProcessingREGEX ne "") { eval '$refLocus =~ s/$referenceGeneProcessingREGEX//g;'; }
	if($geneProcessingREGEX ne "") { eval '$locus =~ s/$geneProcessingREGEX//g;'; }
	$locus = "$species-$genome-$locus";
	$refLocus = "$referenceGenomeSpecies-$referenceGenome-$refLocus";
	
	if($curReadLocus ne $locus) {
		$curReadLocus= $locus;
		$blastHitForLocusCount=0;
	}
	if(($eval < $MIN_EVAL) && (defined $genesWithFootprint{ $locus }) && $identity > $minIdentity) {

		if(defined $genome2RefHits{$refLocus}{$locus}) {
			if($logeval > $genome2RefHits{$refLocus}{$locus}) {
				$genome2RefHits{$refLocus}{$locus} = $logeval;
			}
		} else {
			$genome2RefHits{$refLocus}{$locus} = $logeval;
		}
	}
	if(($blastLineRead++ % 500000) ==0) { print "."; }
}
print "\n";
print localtime() . ": PROGRESS : Done reading genome to reference blast. Read " . (scalar (keys %genome2RefHits)) . " pairs.\n";

print localtime() . ": PROGRESS : Start reading reference to genome blast.";

###### Read REFERENCE 2 genome hits
while(my $curline = <$ref2g>) {

	chomp($curline);
	my @line_s = split(/\t/, $curline);
	my $refLocus = $line_s[0];
	my $locus = $line_s[1];
	my $identity = $line_s[2];
	my $eval = $line_s[10];
	my $logeval = logEval($eval);

	### Translate protein name to  full and unique gene name
	if($referenceGeneProcessingREGEX ne "") { eval '$refLocus =~ s/$referenceGeneProcessingREGEX//g;'; }
	if($geneProcessingREGEX ne "") { eval '$locus =~ s/$geneProcessingREGEX//g;'; }

	$locus = "$species-$genome-$locus";
	$refLocus = "$referenceGenomeSpecies-$referenceGenome-$refLocus";

	if($curReadLocus ne $refLocus) {
		$curReadLocus= $refLocus;
		$blastHitForLocusCount=0;
	}

	if($eval < $MIN_EVAL && $identity> $minIdentity) {

		if(defined $ref2GenomeHits{$refLocus}{$locus}) {
			if($logeval > $ref2GenomeHits{$refLocus}{$locus}) {
				$ref2GenomeHits{$refLocus}{$locus} = $logeval;
			}
		} else {
			$ref2GenomeHits{$refLocus}{$locus} = $logeval;
		}
	}
	if(($blastLineRead++ % 500000)==0) { print ".";}

}
print "\n";
print localtime() . ": PROGRESS : Done reading genome to reference blast. Read " . (scalar (keys %ref2GenomeHits)) . " pairs.\n";

my %orthologTable;

### Add the evalues
foreach my $curRefLocus (keys %genome2RefHits) {
	foreach my $curLocus (keys %{ $genome2RefHits{$curRefLocus} } ) {
		$orthologTable{$curRefLocus}{$curLocus} = $genome2RefHits{$curRefLocus}{$curLocus} +  $ref2GenomeHits{$curRefLocus}{$curLocus};
	}
}

print localtime() . ": PROGRESS : Start writing orthologs.\n";
my $orthologPairsCount=0;

### Select cutoff

foreach my $curRefLocus (keys %orthologTable) {
	my $cutoff;
	my @allevals;
	my %orthologsForRefLocus;

	foreach my $curLocus (keys %{ $orthologTable{$curRefLocus}} ) {
		my $key = $curLocus . "," . $curRefLocus;
		$orthologsForRefLocus{$key}= $orthologTable{$curRefLocus}{$curLocus};
		#push(@allevals, $orthologTable{$curRefLocus}{$curLocus});
	}
	my @sortedOrthologPairs = reverse (sort { $orthologsForRefLocus{$a} <=> $orthologsForRefLocus{$b} } keys %orthologsForRefLocus );

	$cutoff = $orthologsForRefLocus{$sortedOrthologPairs[0]} * $EVAL_CUTOFF_FOR_FAMILY;

	### Unless its a small protein. then, no cutoff
	if(($proteinsLengths{"$curRefLocus"} <= $MAX_HIGH_SENSITIVITY_PROTEIN_LENGTH)) { # Unless we are processing a high sensitivity gene (small peptides, usually)
											 # where we have a absolute cutoff.
		$cutoff = 0;
	}
	my $outputOrthologCount=0;

	### Output ortholog pairs
	foreach my $curOrthologPair (@sortedOrthologPairs) {
		if($orthologsForRefLocus{$curOrthologPair} >= $cutoff && $outputOrthologCount++ < $MAX_ORTH) {
			print $output ("$curOrthologPair," . $orthologsForRefLocus{$curOrthologPair}  . "\n");
			$orthologPairsCount++;
		}
	}

}
		
close($output);

print localtime() . ": PROGRESS : Wrote $orthologPairsCount putative ortholog pairs.\n";

## Finally filter for possible duplicates. This shouldn't happen, but sometimes does with odd annotation of spliceforms
system("awk -F ',' '!seen[\$1\$2]++' $outputOrthologFileName.tmp > $outputOrthologFileName");
unlink("$outputOrthologFileName.tmp");


sub logEval {
	my ($eval) = @_;
	my $logeval;

	if($eval ==0) { 
		$logeval = $MAX_EVAL;
	} else {
		$logeval = -log($eval);
		if($logeval > $MAX_EVAL) {
			$logeval = $MAX_EVAL;
		}
	}
	return($logeval);
}